- Load required libraries
library(Boruta)
library(clValid)
library(DESeq2)
library(factoextra)
library(gplots)
library(multtest)
library(NbClust)
library(pvclust)
#library(quantro) #This package was used in R-3.6 version
library(RColorBrewer)
library(rgl)
library(Rtsne)
library(scatterplot3d)
library(viridis)
- Run differential expression analysis using DESeq2 (as suggested in DESeq2’s vignette; http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)
#Load raw read counts (determined with DuffyNGS)
#All time points have three replicates but T15 (which has six replicates)
counts.hypoxia<-as.matrix(read.csv("hypoxia_raw_reads.csv",row.names=1))
#Replace few NA values with zeroes (0.049%)
na.hypoxia<-which(is.na(counts.hypoxia))
counts.hypoxia[na.hypoxia]<-0
#Print time points/replicates in the expression matrix
print(colnames(counts.hypoxia))
[1] "T0_A" "T0_B" "T0_C" "T1_A" "T1_B" "T1_C" "T2_A" "T2_B" "T2_C" "T3_A" "T3_B" "T3_C"
[13] "T4_A" "T4_B" "T4_C" "T5_A" "T5_B" "T5_C" "T6_A" "T6_B" "T6_C" "T7_A" "T7_B" "T7_C"
[25] "T8_A" "T8_B" "T8_C" "T9_A" "T9_B" "T9_C" "T10_A" "T10_B" "T10_C" "T11_A" "T11_B" "T11_C"
[37] "T12_A" "T12_B" "T12_C" "T13_A" "T13_B" "T13_C" "T14_A" "T14_B" "T14_C" "T15_A" "T15_B" "T15_C"
[49] "T15_D" "T15_E" "T15_F" "T16_A" "T16_B" "T16_C" "T17_A" "T17_B" "T17_C" "T18_A" "T18_B" "T18_C"
[61] "T19_A" "T19_B" "T19_C" "T20_A" "T20_B" "T20_C" "T21_A" "T21_B" "T21_C" "T22_A" "T22_B" "T22_C"
[73] "T23_A" "T23_B" "T23_C" "T24_A" "T24_B" "T24_C" "T25_A" "T25_B" "T25_C"
#Create list with differentially expressed genes (DEGs) at each time point
de.genes<-list()
#Save names of all DEGs
DEGs.timecourse<-c()
#DESeq2 analysis is performed for each time point with respect to T0
for(i in c(2:16,18:27))
{
#For time points different to T15
if(i!=16)
{
count.tab<-counts.hypoxia[,c(1:3,((3*i)-2):(3*i))]
#Round read counts (determined by DuffyNGS)
count.tab<-round(count.tab)
#Metadata (triplicates)
exp.metadata <- data.frame(condition = factor(rep(c("control", "tpoint"), c(3,3))))
rownames(exp.metadata) <- colnames(count.tab)
dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
#Remove genes with zero reads in all samples
dds <- dds[ rowSums(counts(dds)) > 1, ]
#Run main function
dds <- DESeq(dds)
#Output
output.deseq <- results(dds)
#Save names of DEGs
degs<-rownames(output.deseq)[which(output.deseq$padj < 0.05 & abs(output.deseq$log2FoldChange)>1)]
de.genes[[i]]<-degs
DEGs.timecourse<-union(DEGs.timecourse,degs)
}
#For T15
else
{
count.tab<-counts.hypoxia[,c(1:3,((3*i)-2):((3*i)+3))]
#Round read counts
count.tab<-round(count.tab)
#Metadata
exp.metadata <- data.frame(condition = factor(rep(c("control", "tpoint"), c(3,6))))
rownames(exp.metadata) <- colnames(count.tab)
dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
#Remove genes with zero reads in all samples
dds <- dds[ rowSums(counts(dds)) > 1, ]
#Run main function
dds <- DESeq(dds)
#Output
output.deseq <- results(dds)
#Save names of DEGs
degs<-rownames(output.deseq)[which(output.deseq$padj < 0.05 & abs(output.deseq$log2FoldChange)>1)]
de.genes[[i]]<-degs
DEGs.timecourse<-union(DEGs.timecourse,degs)
}
}
2a. Normalized full expression matrix and perform hierarchichal clustering on median profiles
#Normalize read counts using all time points and replicates
count.tab<-round(counts.hypoxia)
#Metadata
exp.metadata <- data.frame(condition = factor(sapply(1:ncol(counts.hypoxia),function(x){strsplit(colnames(counts.hypoxia)[x],split = "_")[[1]][1]})))
rownames(exp.metadata) <- colnames(count.tab)
#Estimate normalized counts with DESeq2 (log transformation and normal distribution)
dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
converting counts to integer mode
dds <- estimateSizeFactors(dds)
normalized.counts <- assay(normTransform(dds))
#Estimate median counts for each gene per time point
time.points<-paste(unique(exp.metadata$condition),"_",sep="")
median.normalized.counts<-c()
for(z in time.points)
{
col.pos<-grep(z,colnames(normalized.counts))
median.normalized.counts<-cbind(median.normalized.counts,apply(FUN=median,MARGIN=1,normalized.counts[,col.pos]))
}
colnames(median.normalized.counts)<-paste("T",0:25,sep="")
#Hierarchical clustering with bootstrap values
hierarchical.clustering.euclidean<-pvclust(data = median.normalized.counts,method.dist ="euclidean")
Bootstrap (r = 0.5)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.7)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.9)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.1)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.3)... Done.
Bootstrap (r = 1.4)... Done.
plot(hierarchical.clustering.euclidean, print.pv=TRUE, print.num=FALSE, float=0.01,
col.pv=c(2,3,8), cex.pv=0.8,cex=0.8,main="Fig S1B")

2b. Apply PCA to the full normalized expression matrix
#The average of the two samples from each reactor was used for defining T15
normalized.counts.t15_average<-cbind(normalized.counts[,1:45],rowMeans(normalized.counts[,c(46,49)]),
rowMeans(normalized.counts[,c(47,50)]),
rowMeans(normalized.counts[,c(48,51)]), normalized.counts[,52:81])
colnames(normalized.counts.t15_average)[46:48]<-c("T15_A","T15_B","T15_C")
#Run PCA
pca.input<-t(normalized.counts.t15_average)
pca.output<-prcomp(pca.input)
#Plot 3D PCA (for the final Figure S1A we used the interactive plot3d function)
#clusters.colors<-c("blue","deepskyblue2","orange","darkolivegreen1","green3","red")
#stateLabels=c(rep(1,2),rep(2,6),3,3,4,3,rep(4,4),rep(5,3),rep(3,3),rep(6,4))
#gr <- grid3d('z')
#plot3d(pca.output$x[,1:3],col=rep(clusters.colors[stateLabels],each=3),size = 10)
#axes3d(labels=F,tick=F,box=F)
#text3d(pca.output$x[,1:3]+1,texts=rownames(pca.input),cex=1.1)
pca.plot<-scatterplot3d(x = pca.output$x[,1], y = pca.output$x[,3], z = pca.output$x[,2],
pch = 19,angle = -190,xlab = "PC1",ylab="PC3",zlab="PC2",color=rep(viridis(26),each=3),main="Fig S1A")
legend(pca.plot$xyz.convert(20, -40, 30), pch=19, col=viridis(26), legend = unique(exp.metadata$condition), bty="o", cex=.35)

2c. Apply clustering algorithms (hierarchichal clustering, k-means) to the full normalized expression matrix
#Hierarchical clustering with bootstrap values (all replicates)
hierarchical.clustering.euclidean.reps<-pvclust(data = normalized.counts.t15_average,method.dist ="euclidean")
Bootstrap (r = 0.5)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.7)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.9)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.1)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.3)... Done.
Bootstrap (r = 1.4)... Done.
plot(hierarchical.clustering.euclidean.reps, print.pv=TRUE, print.num=FALSE, float=0.01,
col.pv=c(2,3,8), cex.pv=0.4,cex=0.5,main="Fig S1C")

#K-means with k=2
k.means.2<-kmeans(t(normalized.counts.t15_average),2,nstart = 25)
#Print one of the two k-means clusters
print(names(which(k.means.2$cluster == 1)))
[1] "T8_A" "T8_B" "T9_B" "T9_C" "T11_A" "T11_B" "T16_B" "T17_A" "T19_A" "T19_B" "T19_C" "T20_A"
[13] "T20_B" "T20_C" "T21_A" "T21_B"
#K-means with k=6
k.means.6<-kmeans(t(normalized.counts.t15_average),6,nstart = 25)
#Print replicates clusters (related to Fig S1D)
print(k.means.6$cluster)
T0_A T0_B T0_C T1_A T1_B T1_C T2_A T2_B T2_C T3_A T3_B T3_C T4_A T4_B T4_C T5_A T5_B
6 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 6
T5_C T6_A T6_B T6_C T7_A T7_B T7_C T8_A T8_B T8_C T9_A T9_B T9_C T10_A T10_B T10_C T11_A
3 3 3 3 3 3 3 4 4 3 1 4 4 1 1 3 4
T11_B T11_C T12_A T12_B T12_C T13_A T13_B T13_C T14_A T14_B T14_C T15_A T15_B T15_C T16_A T16_B T16_C
4 2 1 1 1 1 1 1 1 1 1 1 2 1 2 4 2
T17_A T17_B T17_C T18_A T18_B T18_C T19_A T19_B T19_C T20_A T20_B T20_C T21_A T21_B T21_C T22_A T22_B
4 2 2 2 2 2 4 4 4 4 4 4 4 4 5 5 5
T22_C T23_A T23_B T23_C T24_A T24_B T24_C T25_A T25_B T25_C
5 5 5 5 5 5 5 5 5 5
#Compute silhouette clustering metric using clValid package
clValid.internal.metrics <- clValid(t(normalized.counts.t15_average), nClust = 2:26,clMethods = c("hierarchical","kmeans","diana","pam","clara"), validation = "internal")
#Silhouette values were extracted using the summary(clValid.internal.metrics) command
silhouette.kmeans<-c(0,0.4706,0.2948,0.3043,0.3666,0.3814,0.3553,0.3432,0.3163,0.3136,0.2949,0.2902,
0.2710,0.2555,0.2506,0.2498,0.2303,0.2317,0.2283,0.2277,0.2310,0.2275,0.2250,
0.2216,0.2200,0.2155)
plot(x=1:26,y=silhouette.kmeans,lwd=1.2,type="o",pch=20,xlab="Number of clusters k",ylab="Average silhouette width",cex.lab=1.5,cex=1.5,col="purple",cex.axis=1.5, main="Fig S1E")

2d. Ensemble clustering- Part I
#First we perform ensemble clustering for k-means (with 1< k < 11)
#Number of bootstraps
N=5000
#Create co-clustering matrix
clustering.matrix<-matrix(ncol=26,nrow=26,0)
rownames(clustering.matrix)<-unique(sapply(1:ncol(normalized.counts.t15_average),function(x){strsplit(colnames(normalized.counts.t15_average)[x],split = "_")[[1]][1]}))
colnames(clustering.matrix)<-rownames(clustering.matrix)
for(n in 1:N)
{
#New time series with replicates from different reactors (one random replicate per time point)
permutation.sampling<-c()
for(i in 0:25)
{
#Location of current time point replicates in the normalized expression matrix
potential.point<- (i*3) + (1:3)
#Random selection of single replicate
selected.point<-potential.point[sample(1:3,1)]
#Add selected replicate to time series
permutation.sampling<-cbind(permutation.sampling,normalized.counts.t15_average[,selected.point])
}
colnames(permutation.sampling)<-colnames(clustering.matrix)
#Compute k-means with k in the [2,10] range
for(z in 2:10)
{
temp.kmeans.z<-kmeans(t(permutation.sampling),z,nstart = 25)
#Identify time points that clustered together
for(a in 1:z){
clustered.points<-names(which( temp.kmeans.z$cluster == a))
if(length(clustered.points)>1)
{
for(d in 1:length(clustered.points))
{
for(c in 1:length(clustered.points))
{
clustering.matrix[clustered.points[d],clustered.points[c]]<- clustering.matrix[clustered.points[d],clustered.points[c]]+1
}
}
}
else
{
clustering.matrix[clustered.points,clustered.points]<- clustering.matrix[clustered.points,clustered.points]+1
}
}
}
}
heatmap.2(clustering.matrix/(N*9),Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",col = colorRampPalette(rev(brewer.pal(20, "PiYG")) )(20),density.info="none",main="Fig S1F")
n too large, allowed maximum for palette PiYG is 11
Returning the palette you asked for with that many colors

2e. Ensemble clustering- Part II
#Now that the existence of six clusters has been confirmed. We perform ensemble clustering for k-means (k=6)
#Number of bootstraps
N=10000
#Create co-clustering matrix
clustering.matrix.200<-matrix(ncol=26,nrow=26,0)
rownames(clustering.matrix.200)<-unique(sapply(1:ncol(normalized.counts.t15_average),function(x){strsplit(colnames(normalized.counts.t15_average)[x],split = "_")[[1]][1]}))
colnames(clustering.matrix.200)<-rownames(clustering.matrix.200)
for(n in 1:N)
{
#New time series with replicates from different reactors (one random replicate per time point)
permutation.sampling<-c()
for(i in 0:25)
{
#Location of current time point replicates in the normalized expression matrix
potential.point<- (i*3) + (1:3)
#Random selection of single replicate
selected.point<-potential.point[sample(1:3,1)]
#Add selected replicate to time series
permutation.sampling<-cbind(permutation.sampling,normalized.counts.t15_average[,selected.point])
}
colnames(permutation.sampling)<-colnames(clustering.matrix.200)
#Run k-means with k=6
temp.kmeans.6<-kmeans(t(permutation.sampling),6,nstart = 25)
#Identify time points that clustered together
for(z in 1:6)
{
clustered.points<-names(which(temp.kmeans.6$cluster == z))
if(length(clustered.points)>1)
{
for(d in 1:length(clustered.points))
{
for(c in 1:length(clustered.points))
{
clustering.matrix.200[clustered.points[d],clustered.points[c]]<- clustering.matrix.200[clustered.points[d],clustered.points[c]]+1
}
}
}
else
{
clustering.matrix.200[clustered.points,clustered.points]<- clustering.matrix.200[clustered.points,clustered.points]+1
}
}
}
clustering.ensemble.200k<-clustering.matrix.200/N
heatmap.2(clustering.ensemble.200k,Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",col = colorRampPalette(rev(brewer.pal(20, "PiYG")) )(20),density.info="none",main="Fig S1G")
n too large, allowed maximum for palette PiYG is 11
Returning the palette you asked for with that many colors

2f. Compare inter- and intra-group variation using quantro (original run in R-3.6)
#quantro.stateLabels=c(rep("N",6),rep("D",18),rep("L",6),rep("E",3),rep("L",3),rep("E",12),rep("M",9),rep("L",9),rep("R",12))
#Remove genes with zero counts in all replicates
#zero.rows<-which(apply(FUN=max,MARGIN=1,normalized.counts.t15_average)==0)
#normalized.counts.reps<-normalized.counts.t15_average[-1*zero.rows,]
#Run Quantro with 10000 bootstraps
#qtest.normalized <- quantro(object = normalized.counts.reps, groupFactor = quantro.stateLabels,B = 10000)
#Plot result
#quantroPlot(qtest.normalized)
2g. Visualize clusters (states) with tSNE plot
#Run tSNE (using only DEGs to have best resolution)
clusters.colors<-c("blue","deepskyblue2","orange","darkolivegreen1","green3","red")
stateLabels=c(rep(1,2),rep(2,6),3,3,4,3,rep(4,4),rep(5,3),rep(3,3),rep(6,4))
tsne.input<-t(median.normalized.counts[DEGs.timecourse,])
tsne.output=Rtsne(tsne.input,dims=2,perplexity=7,verbose=TRUE)
Read the 26 x 26 data matrix successfully!
Using no_dims = 2, perplexity = 7.000000, and theta = 0.500000
Computing input similarities...
Normalizing input...
Building tree...
- point 0 of 26
Done in 0.00 seconds (sparsity = 0.914201)!
Learning embedding...
Iteration 50: error is 60.864247 (50 iterations in 0.02 seconds)
Iteration 100: error is 55.161538 (50 iterations in 0.01 seconds)
Iteration 150: error is 54.162451 (50 iterations in 0.01 seconds)
Iteration 200: error is 55.908350 (50 iterations in 0.01 seconds)
Iteration 250: error is 58.371151 (50 iterations in 0.01 seconds)
Iteration 300: error is 1.512479 (50 iterations in 0.00 seconds)
Iteration 350: error is 1.051103 (50 iterations in 0.00 seconds)
Iteration 400: error is 0.630091 (50 iterations in 0.00 seconds)
Iteration 450: error is 0.151097 (50 iterations in 0.01 seconds)
Iteration 500: error is 0.130585 (50 iterations in 0.01 seconds)
Iteration 550: error is 0.118387 (50 iterations in 0.01 seconds)
Iteration 600: error is 0.103883 (50 iterations in 0.00 seconds)
Iteration 650: error is 0.098280 (50 iterations in 0.00 seconds)
Iteration 700: error is 0.100665 (50 iterations in 0.00 seconds)
Iteration 750: error is 0.094375 (50 iterations in 0.00 seconds)
Iteration 800: error is 0.095730 (50 iterations in 0.00 seconds)
Iteration 850: error is 0.097345 (50 iterations in 0.01 seconds)
Iteration 900: error is 0.097089 (50 iterations in 0.01 seconds)
Iteration 950: error is 0.091347 (50 iterations in 0.01 seconds)
Iteration 1000: error is 0.095081 (50 iterations in 0.00 seconds)
Fitting performed in 0.13 seconds.
plot(tsne.output$Y,col=clusters.colors[stateLabels],pch=19,xlab='tSNE Component 1',ylab='tSNE Component 2',main="Fig 2A")
text(x=tsne.output$Y[,1],y=tsne.output$Y[,2],rownames(tsne.input))

3a. Define gene clusters based on clusters-average expression profile
#The six states defined in Fig 2A
transcriptional.states<-c("Normoxia","Depletion","Early","Mid","Late","Resuscitation")
#Time points-states association
time.points.clusters<-list(c(1,2),3:8,c(11,13:16),17:19,c(9,10,12,20:22),23:26)
names(time.points.clusters)<-transcriptional.states
gene.clusters<-rep(list(NULL),length(transcriptional.states))
#Assign each DEG to one transcriptional state
for(f in DEGs.timecourse)
{
#Initialize maximum average expression and state membership
max.level<-mean(median.normalized.counts[f,time.points.clusters[[1]]])
gene.state<-1
#Evaluate if the average is higher in any of the other states
for (s in 2:6)
{
temp.mean<-mean(median.normalized.counts[f,time.points.clusters[[s]]])
#Update state and maximum average
if (temp.mean > max.level)
{
gene.state <- s
max.level<-temp.mean
}
}
gene.clusters[[gene.state]]<-c(gene.clusters[[gene.state]],f)
}
names(gene.clusters)<-transcriptional.states
#This function computes bicluster residual using formula from Cheng and Church, 2000
bc.residual<-function(genes,conditions,exp.mat)
{
volume<-length(genes)*length(conditions)
D_IJ<-mean(exp.mat[genes,conditions])
residual<-0
for(f in genes)
{
for(z in conditions)
{
temp.r<-exp.mat[f,z] - mean(exp.mat[f,conditions]) - mean(exp.mat[genes,z]) + D_IJ
residual<- residual + (temp.r)^2
}
}
residual<- residual/volume
residual
}
#This function computes the average Pearson correlation of a group of genes
cluster.correlation<-function(genes)
{
all.correlations<-c()
for(w in 1:(length(genes)-1))
{
for(t in (w+1):length(genes))
{
all.correlations<-c(all.correlations,cor(median.normalized.counts[genes[w],],median.normalized.counts[genes[t],]))
}
}
output<-mean(all.correlations)
output
}
3b. Comparison with Boruta clustering (random forest based approach)
#Each state vs the other five states
bestPartition<-c(0,0,1,1,1,1,1,1,4,4,2,4,2,2,2,2,3,3,3,4,4,4,5,5,5,5)
boruta.signature.genes<-list()
for(z in 0:5)
{
hypoxia.df<-as.data.frame(cbind(bestPartition,t(median.normalized.counts[DEGs.timecourse,])))
hypoxia.df$bestPartition[which(hypoxia.df$bestPartition != z)]<- 100
hypoxia.df$bestPartition<-as.factor(hypoxia.df$bestPartition)
set.seed(123)
boruta.train <- Boruta(bestPartition~., data = hypoxia.df, doTrace = 2)
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
boruta.signature.genes[[z+1]]<-names(final.boruta$finalDecision[which(final.boruta$finalDecision == "Confirmed")])
}
Boruta performed 99 iterations in 13.59856 secs.
Tentatives roughfixed over the last 99 iterations.
15 attributes confirmed important: Rv0064, Rv0421c, Rv0451c, Rv0713, Rv0755c and 10 more;
2567 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0009, Rv0010c and 2562
more;
Boruta performed 99 iterations in 20.47355 secs.
Tentatives roughfixed over the last 99 iterations.
127 attributes confirmed important: Rv0009, Rv0010c, Rv0042c, Rv0055, Rv0096 and 122 more;
2455 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0011c, Rv0013 and 2450
more;
Boruta performed 99 iterations in 19.75365 secs.
Tentatives roughfixed over the last 99 iterations.
80 attributes confirmed important: Rv0040c, Rv0115, Rv0147, Rv0207c, Rv0241c and 75 more;
2502 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0009, Rv0010c and 2497
more;
Boruta performed 99 iterations in 15.5174 secs.
Tentatives roughfixed over the last 99 iterations.
44 attributes confirmed important: Rv0064, Rv0237, Rv0285, Rv0328, Rv0827c and 39 more;
2538 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0009, Rv0010c and 2533
more;
Boruta performed 99 iterations in 31.77508 secs.
Tentatives roughfixed over the last 99 iterations.
254 attributes confirmed important: Rv0032, Rv0077c, Rv0095c, Rv0106, Rv0124 and 249 more;
2328 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0009, Rv0010c and 2323
more;
Boruta performed 99 iterations in 17.68268 secs.
Tentatives roughfixed over the last 99 iterations.
94 attributes confirmed important: Rv0065, Rv0113, Rv0125, Rv0251c, Rv0644c and 89 more;
2488 attributes confirmed unimportant: Rv0001, Rv0005, Rv0006, Rv0009, Rv0010c and 2483
more;
names(boruta.signature.genes)<-c("Normoxia","Depletion","Early","Mid","Late","Resuscitation")
#Now we create Table S3
tableS3<-matrix(nrow=length(gene.clusters),ncol=6,NA,dimnames = list(transcriptional.states,c("Genes selected based on mean", "Mean Square Residual","Pearson correlation", "# Boruta genes","Overlap", "Recall-Boruta")))
for(z in rownames(tableS3))
{
current.state.genes<-gene.clusters[[z]]
tableS3[z,1]<-length(current.state.genes)
tableS3[z,2]<-bc.residual(genes = current.state.genes,conditions =time.points.clusters[[z]] ,exp.mat = median.normalized.counts )
tableS3[z,3]<-cluster.correlation(current.state.genes)
tableS3[z,4]<-length(boruta.signature.genes[[z]])
tableS3[z,5]<-length(intersect(boruta.signature.genes[[z]],current.state.genes))
tableS3[z,6]<-round(tableS3[z,5]/tableS3[z,4],digits = 2)
}
Print Table S3
print(tableS3)
Genes selected based on mean Mean Square Residual Pearson correlation # Boruta genes
Normoxia 81 0.04448027 0.2814294 15
Depletion 446 0.01494834 0.7135277 127
Early 328 0.05303711 0.5470933 80
Mid 320 0.07686766 0.4000902 44
Late 978 0.02790105 0.7456892 254
Resuscitation 429 0.08484841 0.4753521 94
Overlap Recall-Boruta
Normoxia 3 0.20
Depletion 73 0.57
Early 61 0.76
Mid 24 0.55
Late 185 0.73
Resuscitation 56 0.60
3c. Compare the identified transcriptional states with previous hypoxia models (Table S1)
#Initial hypoxic response
IHR<-scan("IHR_genes.csv",what="character")
Read 49 items
#Enduring hypoxic response
EHR<-scan("EHR_genes.csv",what="character")
Read 230 items
NRP<-read.csv("nrp_data.csv",header=T,row.names=1)
#Define Non replicating persistence 1 genes
NRP1<-rownames(NRP)[which(NRP$pval1 <= 0.05 & NRP$Ratio1 > 2)]
#Define Non replicating persistence 2 genes
NRP2<-rownames(NRP)[which(NRP$pval2 <= 0.05 & NRP$Ratio2 > 2)]
#Comparison matrix
others.hypoxia.models<-c("IHR","EHR","NRP1","NRP2")
tableS1<-matrix(ncol=8,nrow=6,NA,dimnames = list(transcriptional.states,c("IHR recall","IHR P-value","EHR recall","EHR P-value","NRP1 recall","NRP1 P-value","NRP2 recall","NRP2 P-value")))
for(i in names(gene.clusters))
{
for(j in others.hypoxia.models)
{
overlap<-intersect(gene.clusters[[i]],eval(parse(text=j)))
tableS1[i,paste(j,"recall",sep=" ")]<-length(overlap)/length(eval(parse(text=j)))
pval<-phyper(length(overlap)-1,length(eval(parse(text=j))),nrow(median.normalized.counts)-length(eval(parse(text=j))),length(gene.clusters[[i]]),lower.tail=F)
tableS1[i,paste(j,"P-value",sep=" ")]<-pval
}
}
tableS1<-round(tableS1,digits = 3)
print(tableS1)
IHR recall IHR P-value EHR recall EHR P-value NRP1 recall NRP1 P-value NRP2 recall
Normoxia 0.000 1.000 0.004 0.992 0.016 0.731 0.004
Depletion 0.020 0.997 0.035 1.000 0.107 0.593 0.040
Early 0.408 0.000 0.070 0.779 0.176 0.000 0.138
Mid 0.490 0.000 0.226 0.000 0.187 0.000 0.201
Late 0.041 1.000 0.196 0.963 0.032 1.000 0.125
Resuscitation 0.020 0.996 0.165 0.003 0.316 0.000 0.214
NRP2 P-value
Normoxia 0.991
Depletion 1.000
Early 0.002
Mid 0.000
Late 1.000
Resuscitation 0.000
3d.Define the set of differentialy expressed genes in Galagan et al. (2013) and evaluate recall of differentially expressed genes in intracellular MTB (collected with Path-seq) (Table S2)
#Load Galagan's data (triplicates for each time point)
galagan.raw.data<-read.csv("galagan_raw_hypoxia_ts.csv",row.names=1)
#Perform differential expression analysis with bayesian T-test using Cyber-T (Baldi and Long, 2001)
source("cyberTtest.R")
#List with the output of differential expression analyses
galagan.dea<-list()
#List with DEGs at each time point
galagan.deg<-list()
galagan.DEGs.total<-c()
for(t in 1:6)
{
galagan.dea[[t]]<-bayesT(aData = galagan.raw.data[,c(1:3,(1+(3*t)):(3+(3*t)))],numC = 3,numE = 3,conf = 7,doMulttest = T)
galagan.deg[[t]]<-rownames(galagan.dea[[t]])[which(galagan.dea[[t]]$BH < 0.05 & abs(galagan.dea[[t]]$meanC - galagan.dea[[t]]$meanE)> 1)]
galagan.DEGs.total<-union(galagan.DEGs.total,galagan.deg[[t]])
}
#load list of DEGs identified with Path-seq (Peterson et al, 2019)
intracellular.genes<-scan("DEGs_intracellular_MTB_pathseq.txt",what="character")
Read 932 items
tableS2<-matrix(nrow=4,ncol=3,NA,dimnames=list(c("Controlled O2","Galagan13","Wayne","Defined"),c("Overlap w/ intracellular genes","Recall","P-value")))
tableS2[1,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,DEGs.timecourse))
tableS2[1,"Recall"]<-round(tableS2[1,1]/length(intracellular.genes),digits = 3)
tableS2[1,"P-value"]<-phyper(tableS2[1,1]-1,length(intersect(rownames(median.normalized.counts),intracellular.genes)),nrow(median.normalized.counts)-length(intersect(rownames(median.normalized.counts),intracellular.genes)),length(DEGs.timecourse),lower.tail = F)
tableS2[2,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,galagan.DEGs.total))
tableS2[2,"Recall"]<-round(tableS2[2,1]/length(intracellular.genes),digits = 3)
tableS2[2,"P-value"]<-phyper(tableS2[2,1]-1,length(intersect(rownames(galagan.raw.data),intracellular.genes)),nrow(galagan.raw.data)-length(intersect(rownames(galagan.raw.data),intracellular.genes)),length(galagan.DEGs.total),lower.tail = F)
tableS2[3,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,union(NRP1,NRP2)))
tableS2[3,"Recall"]<-round(tableS2[3,1]/length(intracellular.genes),digits = 3)
#There are 4,094 genes in the Path-seq expression dataset, so we take that number as the total population size in the hypergeometric test
tableS2[3,"P-value"]<-phyper(tableS2[3,1]-1,length(intracellular.genes),4094-length(intracellular.genes),length(union(NRP1,NRP2)),lower.tail = F)
tableS2[4,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,union(IHR,EHR)))
tableS2[4,"Recall"]<-round(tableS2[4,1]/length(intracellular.genes),digits = 3)
tableS2[4,"P-value"]<-phyper(tableS2[4,1]-1,length(intracellular.genes),4094-length(intracellular.genes),length(union(IHR,EHR)),lower.tail = F)
print(tableS2)
Overlap w/ intracellular genes Recall P-value
Controlled O2 731 0.784 1.738885e-30
Galagan13 711 0.763 1.042124e-01
Wayne 115 0.123 1.185192e-10
Defined 117 0.126 2.099120e-14
3d. Create Fig 2B draft
clusters.colors<-c("blue","deepskyblue2","darkolivegreen1","green3","orange","red")
names(clusters.colors)<-transcriptional.states
time.vector<-scan("time_vector.txt")
Read 26 items
time.vector<-round(time.vector,digits = 2)
#We summarized common themes of significant functional terms defined by DAVID (Huang da et al, 2009)
DAVID.enriched.selected.terms<-as.matrix(rbind(c(52,31,17,14,10,10,17,30,36,5,36,14,23,15,8,9),c("Translation","Oxididative phos.","Aa met.","ATP synthesis","ETC","Lipid met.","Aa met.","Response to hypoxia","Transcription regulation","Histidine synthesis",
"PPE","Cobalamin synthesis","Mammalian cell entry","ESAT-6 like*","Proteases","DNA transposition"),rep(transcriptional.states[2:6],times=c(3,3,4,3,3))))
rownames(DAVID.enriched.selected.terms)<-c("Number of genes","Term","State")
#Scale normalized read counts in the [0,1] interval
scale.counts<-function(countMatrix)
{
output<-c()
for(q in rownames(countMatrix))
{
min.counts<-min(countMatrix[q,])
max.counts<-max(countMatrix[q,])
counts.range<-max.counts - min.counts
output<- rbind(output,(countMatrix[q,]-min.counts)/counts.range)
}
rownames(output)<-rownames(countMatrix)
output
}
scaled.normalized.median.counts<-scale.counts(median.normalized.counts)
rownames(scaled.normalized.median.counts)<-rownames(median.normalized.counts)
par(mfrow=c(6,3))
par(mar=c(1,1,1,1))
for(j in 1:6)
{
current.state<-transcriptional.states[j]
mean.profile<-colMeans(scaled.normalized.median.counts[gene.clusters[[j]],])
plot(x=time.vector,y=mean.profile,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
text(current.state,x=20,y=0.1)
frame()
if (j == 1)
{
frame()
}
else
{
barplot(as.numeric(DAVID.enriched.selected.terms["Number of genes",which(DAVID.enriched.selected.terms["State",]==current.state)]),cex.lab=1,
names=DAVID.enriched.selected.terms["Term",which(DAVID.enriched.selected.terms["State",]==current.state)],col=rep(clusters.colors[j],length(which(DAVID.enriched.selected.terms["State",]==current.state))),las=2,horiz = T,xlim=c(0,50))
}
}

3e. Create figure similar to Fig 2b but for each reactor
#Define expression matrix for each replicate
repA<-normalized.counts.t15_average[,grep("A",colnames(normalized.counts.t15_average))]
colnames(repA)<-paste("T",0:25,"A",sep="")
repB<-normalized.counts.t15_average[,grep("B",colnames(normalized.counts.t15_average))]
colnames(repB)<-paste("T",0:25,"B",sep="")
repC<-normalized.counts.t15_average[,grep("C",colnames(normalized.counts.t15_average))]
colnames(repC)<-paste("T",0:25,"C",sep="")
repA.scaled<-scale.counts(repA)
repB.scaled<-scale.counts(repB)
repC.scaled<-scale.counts(repC)
par(mfrow=c(6,3))
par(mar=c(1,1,1,1))
for(j in 1:6)
{
current.state<-transcriptional.states[j]
mean.profile.A<-colMeans(repA.scaled[gene.clusters[[j]],])
if(j==1)
{
plot(x=time.vector,y=mean.profile.A,ylim=c(0,1),main="Replicate A",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
else{
plot(x=time.vector,y=mean.profile.A,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
text(current.state,x=20,y=0.1)
mean.profile.B<-colMeans(repB.scaled[gene.clusters[[j]],])
if(j==1)
{
plot(x=time.vector,y=mean.profile.B,ylim=c(0,1),main="Replicate B",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
else
{
plot(x=time.vector,y=mean.profile.B,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
text(current.state,x=20,y=0.1)
mean.profile.C<-colMeans(repC.scaled[gene.clusters[[j]],])
if(j==1)
{
plot(x=time.vector,y=mean.profile.C,ylim=c(0,1),main="Replicate C",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
else{
plot(x=time.vector,y=mean.profile.C,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
}
text(current.state,x=20,y=0.1)
}

3f. Create draft of Fig 2C-F
stateLabels=c(rep(1,2),rep(2,6),5,5,3,5,rep(3,4),rep(4,3),rep(5,3),rep(6,4))
#Data from Kavvas et al. (2018)
metabolic.pathways.tab<-read.csv("metabolic_pathways.csv",header=TRUE)
metabolic.genes<-unique(scan(file="mtb_metabolic_model_genes.txt",what="characters"))
Read 6179 items
#Filter out non-coding elements
metabolic.genes<-metabolic.genes[grep("Rv",metabolic.genes)]
#Create table with the pathways relevant to each gene
gene.path<-c()
for(t in metabolic.genes)
{
pos.path<-grep(t,metabolic.pathways.tab$Gene.Reaction.Rule)
relevant.path<-unique(as.character(metabolic.pathways.tab$Subsystem[pos.path]))
temp.vec<-cbind(rep(t,length(relevant.path)),relevant.path)
gene.path<-rbind(gene.path,temp.vec)
}
all.pathways<-unique(gene.path[,2])
#Pathways shown in Fig 2
selected.pathways<-all.pathways[c(35,41,7,47)]
par(mfrow=c(2,2))
for(z in selected.pathways)
{
genes.in.path<-intersect(DEGs.timecourse,unique(gene.path[which(gene.path[,2]==z),1]))
#Compute fold-change respect to mean normoxia transcript levels (T0 & T1)
foldchanges<-median.normalized.counts[genes.in.path,3:26]-rowMeans(median.normalized.counts[genes.in.path,time.points.clusters$Normoxia])
boxplot(foldchanges,main=paste(z," (",length(genes.in.path),")",sep=""),names=round(time.vector,digits = 1)[-1*(1:2)],las=2,ylab="Log2 fold-change",outline=F,cex.lab=1.5,col=clusters.colors[stateLabels[-1*c(1:2)]])
}

3g. Create Fig S3
#The five type seven secretion systems (T7SS) of MTB
ESX1<-c("Rv3868", "Rv3869", "Rv3877", "Rv3870", "Rv3871", "Rv3882c", "Rv3883c")
ESX2<-c("Rv3884c", "Rv3885c", "Rv3887c", "Rv3894c", "Rv3895c", "Rv3889c")
ESX3<-c("Rv0282", "Rv0283", "Rv0284", "Rv0290", "Rv0292", "Rv0289")
ESX4<-c("Rv3450c", "Rv3447c", "Rv3448")
ESX5<-c("Rv1782", "Rv1783", "Rv1795", "Rv1797", "Rv1798")
par(mfrow=c(2,2))
for(i in 1:5)
{
genes<-eval(parse(text=paste("ESX",i,sep="")))
esx.colors<-rainbow(length(genes))
plot(x=time.vector,y=scaled.normalized.median.counts[genes[1],],xlab="Time (h)",col=esx.colors[1],ylab="Scaled Normalized counts",type="o",main=paste("ESX",i,sep=""),pch=16,ylim=c(0,1))
for(d in 2:length(genes))
{
points(x=time.vector,y=scaled.normalized.median.counts[genes[d],],type="o",col=esx.colors[d],pch=16)
}
}


- Create heatmap with transcriptional profile of differentially expressed (DE) TFs (Fig 3B)
#Load TF over-expression data (Rustad et al 2014)
mtb.exp<-read.table("tfoe_data.txt",header=T,row.names = 1)
de.tfs<-intersect(colnames(mtb.exp),DEGs.timecourse)
de.tfs.ordered<-c()
tfs.colors<-rep("NA",length(de.tfs))
names(tfs.colors)<-de.tfs
for(i in transcriptional.states)
{
de.tfs.current.state<-as.character(intersect(de.tfs,gene.clusters[[i]]))
de.tfs.current.state<-de.tfs.current.state[order(de.tfs.current.state,decreasing = F)]
de.tfs.ordered<-c(de.tfs.ordered,de.tfs.current.state)
tfs.colors[de.tfs.current.state]<-rep(clusters.colors[i],length(de.tfs.current.state))
}
colors <- colorRampPalette(rev(brewer.pal(9, "YlGnBu")) )(99)
heatmap.2(median.normalized.counts[de.tfs.ordered,],Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",scale="row",col = colors,density.info="none",cexRow=0.4,main="Fig 3B",colRow = tfs.colors[de.tfs.ordered])

- Compare fold-change of DEGs in Rv0081 KO MTB (hypoxia) versus their fold-change in the Rv0081 over-expressing MTB (normoxia)
#Load ChIP-seq data (Minch et al. 2015)
mtb.chip<-read.csv("final_chipseq_binding.csv",header=T)
#Define Rv0081 regulon
Rv0081.regulon<-mtb.chip[which(mtb.chip[,1]=="Rv0081"),2]
#Load Rv0081 KO data (Sun et al. 2018)
rv0081.deg.ko<-read.csv("Rv0081_ko_hypoxia.csv",header=T,row.names=1)
#Fix locus names issue (not all of them have a locus name)
names.to.change<-rownames(rv0081.deg.ko)[grep("Rv",rownames(rv0081.deg.ko),invert=T)]
#Use MicrobesOnline data to convert gene names to loci names
mtb.names.tab1<-read.table("mtb_microbesonline.txt",fill=T,header=T)
mtb.names.tab1<-unique.matrix(mtb.names.tab1)
new.names<-c()
for(h in names.to.change)
{
pos.g<-which(mtb.names.tab1[,2]==h)
if(length(pos.g)>0)
{
new.names<-c(new.names,as.character(mtb.names.tab1[pos.g,1]))
}
else
{
new.names<-c(new.names,h)
}
}
rownames(rv0081.deg.ko)[rownames(rv0081.deg.ko) %in% names.to.change]<-new.names
#Same idea with information from the MTB Network portal
mtb.names.tab2<-read.table("mtb_portal_genes2111.txt",fill=T)
mtb.names.tab2<-unique.matrix(mtb.names.tab2)
names.to.change<-rownames(rv0081.deg.ko)[grep("Rv",rownames(rv0081.deg.ko),invert=T)]
new.names<-c()
for(h in names.to.change)
{
pos.g<-which(mtb.names.tab2[,2]==h)
if(length(pos.g)>0)
{
new.names<-c(new.names,as.character(mtb.names.tab2[pos.g,1]))
}
else
{
new.names<-c(new.names,h)
}
}
rownames(rv0081.deg.ko)[rownames(rv0081.deg.ko) %in% names.to.change]<-new.names
#Rv0081 targets that were DE in Rv0081 KO
rv0081.regulon.de<-intersect(rownames(rv0081.deg.ko),Rv0081.regulon)
#mtb.exp contains the data for Rv0081 over-expression
#First plot Rv0081 targets (according to ChIP-seq data) that were DE in the Rv0081 KO
plot(rv0081.deg.ko[rv0081.regulon.de,"Log2.._Rv0081.WT."],mtb.exp[rv0081.regulon.de,"Rv0081"],main="Fig S4A",xlab="Log2 fold-change Rv0081 KO (Hypoxia)",ylab="Log2 fold-change Rv0081 over-expression (Normoxia)",col="red",pch=1,ylim=c(-3,5.5),xlim=c(-9.5,3.5))
other.de.genes.in.rv0081.ko<-intersect(rownames(mtb.exp),setdiff(rownames(rv0081.deg.ko),rv0081.regulon.de))
# Add genes that are not Rv0081 targets (according to ChIP-seq data)
points(rv0081.deg.ko[other.de.genes.in.rv0081.ko,"Log2.._Rv0081.WT."],mtb.exp[other.de.genes.in.rv0081.ko,"Rv0081"],col="dark green",pch=1)
abline(lty=2,col="grey",h=1)
abline(lty=2,col="grey",h=-1)
abline(lty=2,col="grey",v=1)
abline(lty=2,col="grey",v=-1)
legend("topleft",legend=c("Rv0081 target (ChIP-seq)","Not Rv0081 target (ChIP-seq)"),col=c("red","dark green"),pt.cex=1,bty="n",cex=0.65,pch=1)

6a. Create Fig 4A-B
top.ffl<-list(c("Rv0081","Rv3249c"),c("Rv0081","Rv0324"))
#Rv0081-Rv3249c FFL
tf.pair<-top.ffl[[1]]
par(mfrow=c(1,3))
#Depletion-associated genes
depletion.genes<-gene.clusters[["Depletion"]]
not.DEGs<-setdiff(rownames(median.normalized.counts),DEGs.timecourse)
main.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[1]),2]
secondary.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[2]),2]
ffl.targets<-intersect(main.tf.regulon,secondary.tf.regulon)
depletion.genes.not.bound.by.main.tf<-setdiff(depletion.genes,main.tf.regulon)
depletion.genes.bound.by.main.tf.not.second<-setdiff(intersect(depletion.genes,main.tf.regulon),ffl.targets)
depletion.genes.bound.by.ffl<-intersect(depletion.genes,ffl.targets)
hist(rv0081.deg.ko[intersect(depletion.genes,rownames(rv0081.deg.ko)),"Log2.._Rv0081.WT."],col="cadetblue2",main="Fig 4A",
xlab="Log2 fold-change (Rv0081 KO vs WT)")
boxplot(mtb.exp[not.DEGs,"Rv0081"],mtb.exp[depletion.genes.not.bound.by.main.tf,"Rv0081"],mtb.exp[depletion.genes.bound.by.main.tf.not.second,"Rv0081"],mtb.exp[depletion.genes.bound.by.ffl,"Rv0081"],las=2,names=c("A(1467)","B(352)","C(62)","D(32)"),col=c("grey",rep("cadetblue2",3)),ylab="Log2 fold-change (Rv0081 TFOE vs WT)",cex.axis=1,cex.lab=1,main="Fig 4B")
plot.new()
legend("topleft",legend=c("A:not DE in time course","B:not controlled by Rv0081","C:controlled by Rv0081 but not Rv3249c ","D:controlled by Rv0081-Rv3249c FFL"),col="white",pt.cex=1,bty="n",cex=0.74,pch=1)

6b. Create Fig 4C-D
#Rv0081-Rv0324
tf.pair<-top.ffl[[2]]
#Late hypoxia-associated genes
late.genes<-gene.clusters[["Late"]]
main.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[1]),2]
secondary.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[2]),2]
ffl.targets<-intersect(main.tf.regulon,secondary.tf.regulon)
late.genes.not.bound.by.main.tf<-setdiff(late.genes,main.tf.regulon)
late.genes.bound.by.main.tf.not.second<-setdiff(intersect(late.genes,main.tf.regulon),ffl.targets)
late.genes.bound.by.ffl<-intersect(late.genes,ffl.targets)
par(mfrow=c(1,3))
hist(rv0081.deg.ko[intersect(late.genes,rownames(rv0081.deg.ko)),"Log2.._Rv0081.WT."],col="peachpuff",main="Fig 4C",
xlab="Log2 fold-change (Rv0081 KO vs WT)")
boxplot(mtb.exp[not.DEGs,"Rv0081"],mtb.exp[late.genes.not.bound.by.main.tf,"Rv0081"],mtb.exp[late.genes.bound.by.main.tf.not.second,"Rv0081"],mtb.exp[late.genes.bound.by.ffl,"Rv0081"],names=c("A(1467)","B(818)","C(99)","D(61)"),las=2,col=c("grey",rep("peachpuff",3)),ylab="Log2 fold-change (Rv0081 TFOE vs WT)",cex.axis=1,cex.lab=1,main="Fig 4D")
plot.new()
legend("topleft",legend=c("A:not DE in time course","B:not controlled by Rv0081","C:controlled by Rv0081 but not Rv0324 ","D:controlled by Rv0081-Rv0324 FFL"),col="white",pt.cex=1,bty="n",cex=0.74,pch=1)

- Generate input files for MotifNet server (FFL detection) #MotifNet run is temporarily available in: http://netbio.bgu.ac.il/motifnet/#/explorer/1414
input.network<-c()
for(k in 1:nrow(mtb.chip))
{
current.tf<-mtb.chip[k,1]
current.target<-mtb.chip[k,2]
if(current.tf %in% DEGs.timecourse & current.target %in% DEGs.timecourse )
{
input.network<-rbind(input.network,mtb.chip[k,])
}
}
input.nodes<-union(input.network[,1],input.network[,2])
#Write.files
write.table(file="input_network.txt",input.network,quote = F,row.names = F,sep = "\t")
write.table(file="input_nodes.txt",input.nodes,quote = F,row.names = F)
8a. Evaluate overlap between TF regulons and the identified transcriptional states - Part I
regulons.enriched.with.own.state<-c()
for(z in de.tfs)
{
current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
#Define the state of each DE TF
#Default value is Normoxia
tf.state<-1
for(x in 2:6)
{
if(length(grep(z,gene.clusters[[x]]))>0)
{
tf.state<-x
}
}
members.current.regulon.in.TF.state<-intersect(current.regulon,gene.clusters[[tf.state]])
regulons.enriched.with.own.state.pvalue<-phyper(length(members.current.regulon.in.TF.state)-1,length(gene.clusters[[tf.state]]),length(DEGs.timecourse)-length(gene.clusters[[tf.state]]),length(current.regulon),lower.tail = F)
if(regulons.enriched.with.own.state.pvalue <= 0.05)
{
regulons.enriched.with.own.state<-c(regulons.enriched.with.own.state,z)
}
}
print(paste("There are",length(regulons.enriched.with.own.state), "regulons enriched with genes assigned to the same transcriptional state as the regulating TF",sep=" "))
[1] "There are 21 regulons enriched with genes assigned to the same transcriptional state as the regulating TF"
#Permutation test to evaluate significance of enrichment instances
#Number of permutations
N=1000
count.regulons.randomly.enriched.with.same.state<-c()
for(w in 1:N)
{
#Shuffled DEGs state-membership
shuffled.degs<-DEGs.timecourse[sample(1:length(DEGs.timecourse),length(DEGs.timecourse))]
random.states<-list()
for(i in 1:6)
{
random.states[[i]]<-shuffled.degs[1:length(gene.clusters[[i]])]
shuffled.degs<-shuffled.degs[-1*(1:length(random.states[[i]]))]
}
#Evaluate random enrichment
regulons.randomly.enriched.with.own.state<-c()
for(z in de.tfs)
{
current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
#Define the state of each DE TF
#Default value is Normoxia
tf.state<-1
for(x in 2:6)
{
if(length(grep(z,random.states[[x]]))>0)
{
tf.state<-x
}
}
members.current.regulon.in.random.TF.state<-intersect(current.regulon,random.states[[tf.state]])
regulons.randomly.enriched.with.own.state.pvalue<-phyper(length(members.current.regulon.in.random.TF.state)-1,length(random.states[[tf.state]]),length(DEGs.timecourse)-length(random.states[[tf.state]]),length(current.regulon),lower.tail = F)
if(regulons.randomly.enriched.with.own.state.pvalue <= 0.05)
{
regulons.randomly.enriched.with.own.state<-c(regulons.randomly.enriched.with.own.state,z)
}
}
count.regulons.randomly.enriched.with.same.state<-c(count.regulons.randomly.enriched.with.same.state,length(regulons.randomly.enriched.with.own.state))
}
#Permutation p-value
print(length(which(count.regulons.randomly.enriched.with.same.state >= length(regulons.enriched.with.own.state))))
[1] 0
8b. Evaluate overlap between TF regulons and the identified transcriptional states - Part II
regulons.enriched.with.other.state<-c()
for(z in de.tfs)
{
current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
#Define the state of each DE TF
#Default value is Normoxia
tf.state<-1
for(x in 2:6)
{
if(length(grep(z,gene.clusters[[x]]))>0)
{
tf.state<-x
}
}
for(q in setdiff(1:6,tf.state))
{
members.current.regulon.in.other.state<-intersect(current.regulon,gene.clusters[[q]])
enrichment.with.other.state.pvalue<-phyper(length(members.current.regulon.in.other.state)-1,length(gene.clusters[[q]]),length(DEGs.timecourse)-length(gene.clusters[[q]]),length(current.regulon),lower.tail = F)
if(enrichment.with.other.state.pvalue <= 0.05)
{
regulons.enriched.with.other.state<-c(regulons.enriched.with.other.state,z)
}
}
}
print(paste("There are",length(unique(regulons.enriched.with.other.state)), "regulons enriched with genes assigned a transcriptional state different from the one of the regulating TF",sep=" "))
[1] "There are 49 regulons enriched with genes assigned a transcriptional state different from the one of the regulating TF"
---
title: "R code for computational analyses reported in Intricate Genetic Programs Controlling Dormancy in Mycobacterium tuberculosis"
output: html_notebook
author: Mario Arrieta-Ortiz et al. (Institute for Systems Biology, Baliga Lab)
date: October 30, 2019
---
0. Load required libraries
```{r message=FALSE}
library(Boruta)
library(clValid)
library(DESeq2)
library(factoextra)
library(gplots)
library(multtest)
library(NbClust) 
library(pvclust)
#library(quantro) #This package was used in R-3.6 version 
library(RColorBrewer)
library(rgl)
library(Rtsne)
library(scatterplot3d)
library(viridis)
```
1. Run differential expression analysis using DESeq2 (as suggested in DESeq2's vignette; http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)
```{r message=FALSE}
#Load raw read counts (determined with DuffyNGS)
#All time points have three replicates but T15 (which has six replicates)
counts.hypoxia<-as.matrix(read.csv("hypoxia_raw_reads.csv",row.names=1))
#Replace few NA values with zeroes (0.049%)
na.hypoxia<-which(is.na(counts.hypoxia))
counts.hypoxia[na.hypoxia]<-0
#Print time points/replicates in the expression matrix
print(colnames(counts.hypoxia))
#Create list with differentially expressed genes (DEGs) at each time point 
de.genes<-list()
#Save names of all DEGs
DEGs.timecourse<-c()
#DESeq2 analysis is performed for each time point with respect to T0
for(i in c(2:16,18:27))
{
  #For time points different to T15
  if(i!=16)
  {
    count.tab<-counts.hypoxia[,c(1:3,((3*i)-2):(3*i))]
    #Round read counts (determined by DuffyNGS)
    count.tab<-round(count.tab)
    #Metadata (triplicates)
    exp.metadata <- data.frame(condition = factor(rep(c("control", "tpoint"), c(3,3))))
    rownames(exp.metadata) <- colnames(count.tab)
    dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
    #Remove genes with zero reads in all samples
    dds <- dds[ rowSums(counts(dds)) > 1, ]
    #Run main function
    dds <- DESeq(dds)
    #Output
    output.deseq <- results(dds)
    #Save names of DEGs
    degs<-rownames(output.deseq)[which(output.deseq$padj < 0.05 & abs(output.deseq$log2FoldChange)>1)]
    de.genes[[i]]<-degs
    DEGs.timecourse<-union(DEGs.timecourse,degs)
  }
  #For T15
  else
  {
    count.tab<-counts.hypoxia[,c(1:3,((3*i)-2):((3*i)+3))]
    #Round read counts
    count.tab<-round(count.tab)
    #Metadata
    exp.metadata <- data.frame(condition = factor(rep(c("control", "tpoint"), c(3,6))))
    rownames(exp.metadata) <- colnames(count.tab)
    dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
    #Remove genes with zero reads in all samples
    dds <- dds[ rowSums(counts(dds)) > 1, ]
    #Run main function
    dds <- DESeq(dds)
    #Output
    output.deseq <- results(dds)
    #Save names of DEGs
    degs<-rownames(output.deseq)[which(output.deseq$padj < 0.05 & abs(output.deseq$log2FoldChange)>1)]
    de.genes[[i]]<-degs
    DEGs.timecourse<-union(DEGs.timecourse,degs)
  }
}
```
2a. Normalized full expression matrix and perform hierarchichal clustering on median profiles
```{r}
#Normalize read counts using all time points and replicates
count.tab<-round(counts.hypoxia)
#Metadata
exp.metadata <- data.frame(condition = factor(sapply(1:ncol(counts.hypoxia),function(x){strsplit(colnames(counts.hypoxia)[x],split = "_")[[1]][1]})))
rownames(exp.metadata) <- colnames(count.tab)
#Estimate normalized counts with DESeq2 (log transformation and normal distribution)
dds <- DESeqDataSetFromMatrix(count.tab, exp.metadata, ~condition)
dds <- estimateSizeFactors(dds)
normalized.counts <- assay(normTransform(dds))
#Estimate median counts for each gene per time point
time.points<-paste(unique(exp.metadata$condition),"_",sep="")
median.normalized.counts<-c()
for(z in time.points)
{
	col.pos<-grep(z,colnames(normalized.counts))
	median.normalized.counts<-cbind(median.normalized.counts,apply(FUN=median,MARGIN=1,normalized.counts[,col.pos]))
}
colnames(median.normalized.counts)<-paste("T",0:25,sep="")
#Hierarchical clustering with bootstrap values
hierarchical.clustering.euclidean<-pvclust(data = median.normalized.counts,method.dist ="euclidean")
plot(hierarchical.clustering.euclidean, print.pv=TRUE, print.num=FALSE, float=0.01,
     col.pv=c(2,3,8), cex.pv=0.8,cex=0.8,main="Fig S1B")
```
2b. Apply PCA to the full normalized expression matrix
```{r}
#The average of the two samples from each reactor was used for defining T15
normalized.counts.t15_average<-cbind(normalized.counts[,1:45],rowMeans(normalized.counts[,c(46,49)]),
                               rowMeans(normalized.counts[,c(47,50)]),
                               rowMeans(normalized.counts[,c(48,51)]), normalized.counts[,52:81])
colnames(normalized.counts.t15_average)[46:48]<-c("T15_A","T15_B","T15_C")
#Run PCA
pca.input<-t(normalized.counts.t15_average)
pca.output<-prcomp(pca.input)
#Plot 3D PCA (for the final Figure S1A we used the interactive plot3d function)
#clusters.colors<-c("blue","deepskyblue2","orange","darkolivegreen1","green3","red")
#stateLabels=c(rep(1,2),rep(2,6),3,3,4,3,rep(4,4),rep(5,3),rep(3,3),rep(6,4))
#gr <- grid3d('z')
#plot3d(pca.output$x[,1:3],col=rep(clusters.colors[stateLabels],each=3),size = 10)
#axes3d(labels=F,tick=F,box=F)
#text3d(pca.output$x[,1:3]+1,texts=rownames(pca.input),cex=1.1)
pca.plot<-scatterplot3d(x = pca.output$x[,1], y = pca.output$x[,3], z = pca.output$x[,2],
               pch = 19,angle = -190,xlab = "PC1",ylab="PC3",zlab="PC2",color=rep(viridis(26),each=3),main="Fig S1A")
legend(pca.plot$xyz.convert(20, -40, 30), pch=19, col=viridis(26), legend = unique(exp.metadata$condition), bty="o", cex=.35)
```
2c. Apply clustering algorithms (hierarchichal clustering, k-means) to the full normalized expression matrix
```{r}
#Hierarchical clustering with bootstrap values (all replicates)
hierarchical.clustering.euclidean.reps<-pvclust(data = normalized.counts.t15_average,method.dist ="euclidean")
plot(hierarchical.clustering.euclidean.reps, print.pv=TRUE, print.num=FALSE, float=0.01,
     col.pv=c(2,3,8), cex.pv=0.4,cex=0.5,main="Fig S1C")
#K-means with k=2
k.means.2<-kmeans(t(normalized.counts.t15_average),2,nstart = 25)
#Print one of the two k-means clusters
print(names(which(k.means.2$cluster == 1)))
#K-means with k=6
k.means.6<-kmeans(t(normalized.counts.t15_average),6,nstart = 25)
#Print replicates clusters (related to Fig S1D)
print(k.means.6$cluster)
#Compute silhouette clustering metric using clValid package 
clValid.internal.metrics <- clValid(t(normalized.counts.t15_average), nClust = 2:26,clMethods = c("hierarchical","kmeans","diana","pam","clara"), validation = "internal")
#Silhouette values were extracted using the summary(clValid.internal.metrics) command
silhouette.kmeans<-c(0,0.4706,0.2948,0.3043,0.3666,0.3814,0.3553,0.3432,0.3163,0.3136,0.2949,0.2902,   
              0.2710,0.2555,0.2506,0.2498,0.2303,0.2317,0.2283,0.2277,0.2310,0.2275,0.2250,
              0.2216,0.2200,0.2155)
plot(x=1:26,y=silhouette.kmeans,lwd=1.2,type="o",pch=20,xlab="Number of clusters k",ylab="Average silhouette width",cex.lab=1.5,cex=1.5,col="purple",cex.axis=1.5, main="Fig S1E")
```
2d. Ensemble clustering- Part I
```{r}
#First we perform ensemble clustering for k-means (with 1< k < 11)
#Number of bootstraps
N=5000
#Create co-clustering matrix
clustering.matrix<-matrix(ncol=26,nrow=26,0)
rownames(clustering.matrix)<-unique(sapply(1:ncol(normalized.counts.t15_average),function(x){strsplit(colnames(normalized.counts.t15_average)[x],split = "_")[[1]][1]}))
colnames(clustering.matrix)<-rownames(clustering.matrix)
for(n in 1:N)
{
  #New time series with replicates from different reactors (one random replicate per time point)
  permutation.sampling<-c()
  for(i in 0:25)
  {
  #Location of current time point replicates in the normalized expression matrix
  potential.point<- (i*3) + (1:3)
  #Random selection of single replicate
  selected.point<-potential.point[sample(1:3,1)]
  #Add selected replicate to time series
  permutation.sampling<-cbind(permutation.sampling,normalized.counts.t15_average[,selected.point])
  }
  colnames(permutation.sampling)<-colnames(clustering.matrix)
  #Compute k-means with k in the [2,10] range
  for(z in 2:10)
  {
  temp.kmeans.z<-kmeans(t(permutation.sampling),z,nstart = 25)
  #Identify time points that clustered together
  for(a in 1:z){
  clustered.points<-names(which( temp.kmeans.z$cluster == a))
  if(length(clustered.points)>1)
  {
  for(d in 1:length(clustered.points))
  {
    for(c in 1:length(clustered.points))
    {
      clustering.matrix[clustered.points[d],clustered.points[c]]<- clustering.matrix[clustered.points[d],clustered.points[c]]+1
    }
  }
 }
 else
 {
  clustering.matrix[clustered.points,clustered.points]<- clustering.matrix[clustered.points,clustered.points]+1
 }
 			}
}

}
heatmap.2(clustering.matrix/(N*9),Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",col = colorRampPalette(rev(brewer.pal(20, "PiYG")) )(20),density.info="none",main="Fig S1F")
```

2e. Ensemble clustering- Part II
```{r}
#Now that the existence of six clusters has been confirmed. We perform ensemble clustering for k-means (k=6)
#Number of bootstraps
N=10000
#Create co-clustering matrix
clustering.matrix.200<-matrix(ncol=26,nrow=26,0)
rownames(clustering.matrix.200)<-unique(sapply(1:ncol(normalized.counts.t15_average),function(x){strsplit(colnames(normalized.counts.t15_average)[x],split = "_")[[1]][1]}))
colnames(clustering.matrix.200)<-rownames(clustering.matrix.200)
for(n in 1:N)
{
  #New time series with replicates from different reactors (one random replicate per time point)
  permutation.sampling<-c()
  for(i in 0:25)
  {
  #Location of current time point replicates in the normalized expression matrix
  potential.point<- (i*3) + (1:3)
  #Random selection of single replicate
  selected.point<-potential.point[sample(1:3,1)]
  #Add selected replicate to time series
  permutation.sampling<-cbind(permutation.sampling,normalized.counts.t15_average[,selected.point])
  }
  colnames(permutation.sampling)<-colnames(clustering.matrix.200)
  #Run k-means with k=6
  temp.kmeans.6<-kmeans(t(permutation.sampling),6,nstart = 25)
  #Identify time points that clustered together
  for(z in 1:6)
  {
  clustered.points<-names(which(temp.kmeans.6$cluster == z))
  if(length(clustered.points)>1)
  {
  for(d in 1:length(clustered.points))
  {
    for(c in 1:length(clustered.points))
    {
      clustering.matrix.200[clustered.points[d],clustered.points[c]]<- clustering.matrix.200[clustered.points[d],clustered.points[c]]+1
    }
  }
 }
 else
 {
  clustering.matrix.200[clustered.points,clustered.points]<- clustering.matrix.200[clustered.points,clustered.points]+1
 }
}

}
clustering.ensemble.200k<-clustering.matrix.200/N 
heatmap.2(clustering.ensemble.200k,Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",col = colorRampPalette(rev(brewer.pal(20, "PiYG")) )(20),density.info="none",main="Fig S1G")
```
2f. Compare inter- and intra-group variation using quantro (original run in R-3.6)
```{r}
#quantro.stateLabels=c(rep("N",6),rep("D",18),rep("L",6),rep("E",3),rep("L",3),rep("E",12),rep("M",9),rep("L",9),rep("R",12))
#Remove genes with zero counts in all replicates
#zero.rows<-which(apply(FUN=max,MARGIN=1,normalized.counts.t15_average)==0)
#normalized.counts.reps<-normalized.counts.t15_average[-1*zero.rows,]
#Run Quantro with 10000 bootstraps
#qtest.normalized <- quantro(object = normalized.counts.reps, groupFactor = quantro.stateLabels,B = 10000)
#Plot result
#quantroPlot(qtest.normalized)
```

2g. Visualize clusters (states) with tSNE plot
```{r message=FALSE}
#Run tSNE (using only DEGs to have best resolution)
clusters.colors<-c("blue","deepskyblue2","orange","darkolivegreen1","green3","red")
stateLabels=c(rep(1,2),rep(2,6),3,3,4,3,rep(4,4),rep(5,3),rep(3,3),rep(6,4))
tsne.input<-t(median.normalized.counts[DEGs.timecourse,])
tsne.output=Rtsne(tsne.input,dims=2,perplexity=7,verbose=TRUE)
plot(tsne.output$Y,col=clusters.colors[stateLabels],pch=19,xlab='tSNE Component 1',ylab='tSNE Component 2',main="Fig 2A")
text(x=tsne.output$Y[,1],y=tsne.output$Y[,2],rownames(tsne.input))
```
3a. Define gene clusters based on clusters-average expression profile
```{r}
#The six states defined in Fig 2A
transcriptional.states<-c("Normoxia","Depletion","Early","Mid","Late","Resuscitation")
#Time points-states association
time.points.clusters<-list(c(1,2),3:8,c(11,13:16),17:19,c(9,10,12,20:22),23:26)
names(time.points.clusters)<-transcriptional.states
gene.clusters<-rep(list(NULL),length(transcriptional.states))
#Assign each DEG to one transcriptional state
for(f in DEGs.timecourse)
{
	#Initialize maximum average expression and state membership
  max.level<-mean(median.normalized.counts[f,time.points.clusters[[1]]])
	gene.state<-1
	#Evaluate if the average is higher in any of the other states
	for (s in 2:6)
	{
	  temp.mean<-mean(median.normalized.counts[f,time.points.clusters[[s]]])
		#Update state and maximum average
	  if (temp.mean > max.level)
		{
			gene.state <- s
			max.level<-temp.mean
		}
	}	
	gene.clusters[[gene.state]]<-c(gene.clusters[[gene.state]],f)
}
names(gene.clusters)<-transcriptional.states
#This function computes bicluster residual using formula from Cheng and Church, 2000
bc.residual<-function(genes,conditions,exp.mat)
{
	volume<-length(genes)*length(conditions)
	D_IJ<-mean(exp.mat[genes,conditions])
	residual<-0
	for(f in genes)
	{
		for(z in conditions)
		{
			temp.r<-exp.mat[f,z] - mean(exp.mat[f,conditions]) - mean(exp.mat[genes,z]) + D_IJ
			residual<- residual + (temp.r)^2
		}
	}
	residual<- residual/volume
	residual
}
#This function computes the average Pearson correlation of a group of genes
cluster.correlation<-function(genes)
{
  all.correlations<-c()
  for(w in 1:(length(genes)-1))
  {
    for(t in (w+1):length(genes))
    {
      all.correlations<-c(all.correlations,cor(median.normalized.counts[genes[w],],median.normalized.counts[genes[t],]))
    }
  }
  output<-mean(all.correlations)
  output
}
```
3b. Comparison with Boruta clustering (random forest based approach)
```{r message=FALSE}
#Each state vs the other five states
bestPartition<-c(0,0,1,1,1,1,1,1,4,4,2,4,2,2,2,2,3,3,3,4,4,4,5,5,5,5)
boruta.signature.genes<-list()
for(z in 0:5)
{
hypoxia.df<-as.data.frame(cbind(bestPartition,t(median.normalized.counts[DEGs.timecourse,])))
hypoxia.df$bestPartition[which(hypoxia.df$bestPartition != z)]<- 100
hypoxia.df$bestPartition<-as.factor(hypoxia.df$bestPartition)
set.seed(123)
boruta.train <- Boruta(bestPartition~., data = hypoxia.df, doTrace = 2)
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
boruta.signature.genes[[z+1]]<-names(final.boruta$finalDecision[which(final.boruta$finalDecision == "Confirmed")])
}
names(boruta.signature.genes)<-c("Normoxia","Depletion","Early","Mid","Late","Resuscitation")
#Now we create Table S3
tableS3<-matrix(nrow=length(gene.clusters),ncol=6,NA,dimnames = list(transcriptional.states,c("Genes selected based on mean", "Mean Square Residual","Pearson correlation", "# Boruta genes","Overlap", "Recall-Boruta")))
for(z in rownames(tableS3))
{
  current.state.genes<-gene.clusters[[z]]
  tableS3[z,1]<-length(current.state.genes)
  tableS3[z,2]<-bc.residual(genes = current.state.genes,conditions =time.points.clusters[[z]] ,exp.mat = median.normalized.counts )
  tableS3[z,3]<-cluster.correlation(current.state.genes)
  tableS3[z,4]<-length(boruta.signature.genes[[z]])
  tableS3[z,5]<-length(intersect(boruta.signature.genes[[z]],current.state.genes))
  tableS3[z,6]<-round(tableS3[z,5]/tableS3[z,4],digits = 2)
}
```
Print Table S3
```{r}
print(tableS3)
```

3c. Compare the identified transcriptional states with previous hypoxia models (Table S1)
```{r message=FALSE}
#Initial hypoxic response
IHR<-scan("IHR_genes.csv",what="character")
#Enduring hypoxic response
EHR<-scan("EHR_genes.csv",what="character")
NRP<-read.csv("nrp_data.csv",header=T,row.names=1)
#Define Non replicating persistence 1 genes
NRP1<-rownames(NRP)[which(NRP$pval1 <= 0.05 & NRP$Ratio1 > 2)]
#Define Non replicating persistence 2 genes
NRP2<-rownames(NRP)[which(NRP$pval2 <= 0.05 & NRP$Ratio2 > 2)]
#Comparison matrix
others.hypoxia.models<-c("IHR","EHR","NRP1","NRP2")
tableS1<-matrix(ncol=8,nrow=6,NA,dimnames = list(transcriptional.states,c("IHR recall","IHR P-value","EHR recall","EHR P-value","NRP1 recall","NRP1 P-value","NRP2 recall","NRP2 P-value")))
for(i in names(gene.clusters))
{
  for(j in others.hypoxia.models)
  {
    overlap<-intersect(gene.clusters[[i]],eval(parse(text=j)))
    tableS1[i,paste(j,"recall",sep=" ")]<-length(overlap)/length(eval(parse(text=j)))
    pval<-phyper(length(overlap)-1,length(eval(parse(text=j))),nrow(median.normalized.counts)-length(eval(parse(text=j))),length(gene.clusters[[i]]),lower.tail=F)
    tableS1[i,paste(j,"P-value",sep=" ")]<-pval
  }
}
tableS1<-round(tableS1,digits = 3)
print(tableS1)
```
3d.Define the set of differentialy expressed genes in Galagan et al. (2013) and evaluate recall of differentially expressed genes in intracellular MTB (collected with Path-seq) (Table S2)
```{r}
#Load Galagan's data (triplicates for each time point)
galagan.raw.data<-read.csv("galagan_raw_hypoxia_ts.csv",row.names=1)
#Perform differential expression analysis with bayesian T-test using Cyber-T (Baldi and Long, 2001)
source("cyberTtest.R")
#List with the output of differential expression analyses
galagan.dea<-list()
#List with DEGs at each time point
galagan.deg<-list()
galagan.DEGs.total<-c()
for(t in 1:6)
{
    galagan.dea[[t]]<-bayesT(aData = galagan.raw.data[,c(1:3,(1+(3*t)):(3+(3*t)))],numC = 3,numE = 3,conf = 7,doMulttest = T)
    galagan.deg[[t]]<-rownames(galagan.dea[[t]])[which(galagan.dea[[t]]$BH < 0.05 & abs(galagan.dea[[t]]$meanC - galagan.dea[[t]]$meanE)> 1)]
    galagan.DEGs.total<-union(galagan.DEGs.total,galagan.deg[[t]])
}
#load list of DEGs identified with Path-seq (Peterson et al, 2019)
intracellular.genes<-scan("DEGs_intracellular_MTB_pathseq.txt",what="character")
tableS2<-matrix(nrow=4,ncol=3,NA,dimnames=list(c("Controlled O2","Galagan13","Wayne","Defined"),c("Overlap w/ intracellular genes","Recall","P-value")))
tableS2[1,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,DEGs.timecourse))
tableS2[1,"Recall"]<-round(tableS2[1,1]/length(intracellular.genes),digits = 3)
tableS2[1,"P-value"]<-phyper(tableS2[1,1]-1,length(intersect(rownames(median.normalized.counts),intracellular.genes)),nrow(median.normalized.counts)-length(intersect(rownames(median.normalized.counts),intracellular.genes)),length(DEGs.timecourse),lower.tail = F)
tableS2[2,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,galagan.DEGs.total))
tableS2[2,"Recall"]<-round(tableS2[2,1]/length(intracellular.genes),digits = 3)
tableS2[2,"P-value"]<-phyper(tableS2[2,1]-1,length(intersect(rownames(galagan.raw.data),intracellular.genes)),nrow(galagan.raw.data)-length(intersect(rownames(galagan.raw.data),intracellular.genes)),length(galagan.DEGs.total),lower.tail = F)
tableS2[3,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,union(NRP1,NRP2)))
tableS2[3,"Recall"]<-round(tableS2[3,1]/length(intracellular.genes),digits = 3)
#There are 4,094 genes in the Path-seq expression dataset, so we take that number as the total population size in the hypergeometric test 
tableS2[3,"P-value"]<-phyper(tableS2[3,1]-1,length(intracellular.genes),4094-length(intracellular.genes),length(union(NRP1,NRP2)),lower.tail = F)
tableS2[4,"Overlap w/ intracellular genes"]<-length(intersect(intracellular.genes,union(IHR,EHR)))
tableS2[4,"Recall"]<-round(tableS2[4,1]/length(intracellular.genes),digits = 3)
tableS2[4,"P-value"]<-phyper(tableS2[4,1]-1,length(intracellular.genes),4094-length(intracellular.genes),length(union(IHR,EHR)),lower.tail = F)
print(tableS2)
```
3d. Create Fig 2B draft
```{r}
clusters.colors<-c("blue","deepskyblue2","darkolivegreen1","green3","orange","red")
names(clusters.colors)<-transcriptional.states
time.vector<-scan("time_vector.txt")
time.vector<-round(time.vector,digits = 2)
#We summarized common themes of significant functional terms defined by DAVID (Huang da et al, 2009)
DAVID.enriched.selected.terms<-as.matrix(rbind(c(52,31,17,14,10,10,17,30,36,5,36,14,23,15,8,9),c("Translation","Oxididative phos.","Aa met.","ATP synthesis","ETC","Lipid met.","Aa met.","Response to hypoxia","Transcription regulation","Histidine synthesis",
		"PPE","Cobalamin synthesis","Mammalian cell entry","ESAT-6 like*","Proteases","DNA transposition"),rep(transcriptional.states[2:6],times=c(3,3,4,3,3))))
rownames(DAVID.enriched.selected.terms)<-c("Number of genes","Term","State")
#Scale normalized read counts in the [0,1] interval
scale.counts<-function(countMatrix)
{
  output<-c()
for(q in rownames(countMatrix))
{
  min.counts<-min(countMatrix[q,])
	max.counts<-max(countMatrix[q,])
	counts.range<-max.counts - min.counts
	output<- rbind(output,(countMatrix[q,]-min.counts)/counts.range)
}
 rownames(output)<-rownames(countMatrix)
 output
}
scaled.normalized.median.counts<-scale.counts(median.normalized.counts)
rownames(scaled.normalized.median.counts)<-rownames(median.normalized.counts)
par(mfrow=c(6,3))
par(mar=c(1,1,1,1))
for(j in 1:6)
{
  current.state<-transcriptional.states[j]
  mean.profile<-colMeans(scaled.normalized.median.counts[gene.clusters[[j]],])
  plot(x=time.vector,y=mean.profile,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  text(current.state,x=20,y=0.1)
  frame()
  if (j == 1)
  {
    frame()
  }
  else
  {
  barplot(as.numeric(DAVID.enriched.selected.terms["Number of genes",which(DAVID.enriched.selected.terms["State",]==current.state)]),cex.lab=1,
	names=DAVID.enriched.selected.terms["Term",which(DAVID.enriched.selected.terms["State",]==current.state)],col=rep(clusters.colors[j],length(which(DAVID.enriched.selected.terms["State",]==current.state))),las=2,horiz = T,xlim=c(0,50))
 }
}
```
3e. Create figure similar to Fig 2b but for each reactor
```{r}
#Define expression matrix for each replicate
repA<-normalized.counts.t15_average[,grep("A",colnames(normalized.counts.t15_average))]
colnames(repA)<-paste("T",0:25,"A",sep="")
repB<-normalized.counts.t15_average[,grep("B",colnames(normalized.counts.t15_average))]
colnames(repB)<-paste("T",0:25,"B",sep="")
repC<-normalized.counts.t15_average[,grep("C",colnames(normalized.counts.t15_average))]
colnames(repC)<-paste("T",0:25,"C",sep="")
repA.scaled<-scale.counts(repA)
repB.scaled<-scale.counts(repB)
repC.scaled<-scale.counts(repC)
par(mfrow=c(6,3))
par(mar=c(1,1,1,1))
for(j in 1:6)
{
  current.state<-transcriptional.states[j]
  mean.profile.A<-colMeans(repA.scaled[gene.clusters[[j]],])
  if(j==1)
  {
  plot(x=time.vector,y=mean.profile.A,ylim=c(0,1),main="Replicate A",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  }
  else{
    plot(x=time.vector,y=mean.profile.A,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  }
  text(current.state,x=20,y=0.1)

  mean.profile.B<-colMeans(repB.scaled[gene.clusters[[j]],])
  if(j==1)
  {
  plot(x=time.vector,y=mean.profile.B,ylim=c(0,1),main="Replicate B",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  }
  else
  {
   plot(x=time.vector,y=mean.profile.B,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7) 
  }
  text(current.state,x=20,y=0.1)

  mean.profile.C<-colMeans(repC.scaled[gene.clusters[[j]],])
  if(j==1)
  {
  plot(x=time.vector,y=mean.profile.C,ylim=c(0,1),main="Replicate C",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  }
  else{
    plot(x=time.vector,y=mean.profile.C,ylim=c(0,1),main="",xlab="Time",ylab="Scaled counts",type="o",pch=1,col=clusters.colors[j],cex.lab=1,cex.axis=0.7)
  }
  text(current.state,x=20,y=0.1)
}
```
3f. Create draft of Fig 2C-F
```{r}
stateLabels=c(rep(1,2),rep(2,6),5,5,3,5,rep(3,4),rep(4,3),rep(5,3),rep(6,4))
#Data from Kavvas et al. (2018)
metabolic.pathways.tab<-read.csv("metabolic_pathways.csv",header=TRUE)
metabolic.genes<-unique(scan(file="mtb_metabolic_model_genes.txt",what="characters"))
#Filter out non-coding elements
metabolic.genes<-metabolic.genes[grep("Rv",metabolic.genes)]
#Create table with the pathways relevant to each gene
gene.path<-c()
for(t in metabolic.genes)
 {
 pos.path<-grep(t,metabolic.pathways.tab$Gene.Reaction.Rule)
 relevant.path<-unique(as.character(metabolic.pathways.tab$Subsystem[pos.path]))
 temp.vec<-cbind(rep(t,length(relevant.path)),relevant.path)
 gene.path<-rbind(gene.path,temp.vec)
 }
 all.pathways<-unique(gene.path[,2])
 #Pathways shown in Fig 2
 selected.pathways<-all.pathways[c(35,41,7,47)]
 par(mfrow=c(2,2))
 for(z in selected.pathways)
 {
 genes.in.path<-intersect(DEGs.timecourse,unique(gene.path[which(gene.path[,2]==z),1]))
 #Compute fold-change respect to mean normoxia transcript levels (T0 & T1)
 foldchanges<-median.normalized.counts[genes.in.path,3:26]-rowMeans(median.normalized.counts[genes.in.path,time.points.clusters$Normoxia])
 boxplot(foldchanges,main=paste(z," (",length(genes.in.path),")",sep=""),names=round(time.vector,digits = 1)[-1*(1:2)],las=2,ylab="Log2 fold-change",outline=F,cex.lab=1.5,col=clusters.colors[stateLabels[-1*c(1:2)]])
 }
```
3g. Create Fig S3
```{r}
#The five type seven secretion systems (T7SS) of MTB
ESX1<-c("Rv3868", "Rv3869", "Rv3877", "Rv3870", "Rv3871", "Rv3882c", "Rv3883c")
ESX2<-c("Rv3884c", "Rv3885c", "Rv3887c", "Rv3894c", "Rv3895c", "Rv3889c")
ESX3<-c("Rv0282", "Rv0283", "Rv0284", "Rv0290", "Rv0292", "Rv0289")
ESX4<-c("Rv3450c", "Rv3447c", "Rv3448")
ESX5<-c("Rv1782", "Rv1783", "Rv1795", "Rv1797", "Rv1798")
par(mfrow=c(2,2))
for(i in 1:5)
{
  genes<-eval(parse(text=paste("ESX",i,sep="")))
  esx.colors<-rainbow(length(genes))
  plot(x=time.vector,y=scaled.normalized.median.counts[genes[1],],xlab="Time (h)",col=esx.colors[1],ylab="Scaled Normalized counts",type="o",main=paste("ESX",i,sep=""),pch=16,ylim=c(0,1))
  for(d in 2:length(genes))
  {
    points(x=time.vector,y=scaled.normalized.median.counts[genes[d],],type="o",col=esx.colors[d],pch=16)
  }
}
```
4. Create heatmap with transcriptional profile of differentially expressed (DE) TFs (Fig 3B)
```{r}
#Load TF over-expression data (Rustad et al 2014) 
mtb.exp<-read.table("tfoe_data.txt",header=T,row.names = 1)
de.tfs<-intersect(colnames(mtb.exp),DEGs.timecourse)
de.tfs.ordered<-c()
tfs.colors<-rep("NA",length(de.tfs))
names(tfs.colors)<-de.tfs
for(i in transcriptional.states)
{
de.tfs.current.state<-as.character(intersect(de.tfs,gene.clusters[[i]]))
de.tfs.current.state<-de.tfs.current.state[order(de.tfs.current.state,decreasing = F)]
de.tfs.ordered<-c(de.tfs.ordered,de.tfs.current.state)
tfs.colors[de.tfs.current.state]<-rep(clusters.colors[i],length(de.tfs.current.state))
}
colors <- colorRampPalette(rev(brewer.pal(9, "YlGnBu")) )(99)
heatmap.2(median.normalized.counts[de.tfs.ordered,],Rowv = "NA",dendrogram = "none",Colv="NA",trace = "none",scale="row",col = colors,density.info="none",cexRow=0.4,main="Fig 3B",colRow = tfs.colors[de.tfs.ordered])
```
5. Compare fold-change of DEGs in Rv0081 KO MTB (hypoxia) versus their fold-change in the Rv0081 over-expressing MTB (normoxia)
```{r}
#Load ChIP-seq data (Minch et al. 2015)
mtb.chip<-read.csv("final_chipseq_binding.csv",header=T)
#Define Rv0081 regulon
Rv0081.regulon<-mtb.chip[which(mtb.chip[,1]=="Rv0081"),2]
#Load Rv0081 KO data (Sun et al. 2018)
rv0081.deg.ko<-read.csv("Rv0081_ko_hypoxia.csv",header=T,row.names=1)
#Fix locus names issue (not all of them have a locus name)
names.to.change<-rownames(rv0081.deg.ko)[grep("Rv",rownames(rv0081.deg.ko),invert=T)]
#Use MicrobesOnline data to convert gene names to loci names
mtb.names.tab1<-read.table("mtb_microbesonline.txt",fill=T,header=T)
mtb.names.tab1<-unique.matrix(mtb.names.tab1)
new.names<-c()
for(h in names.to.change)
{
 pos.g<-which(mtb.names.tab1[,2]==h)
 if(length(pos.g)>0)
 {
   new.names<-c(new.names,as.character(mtb.names.tab1[pos.g,1]))
 }
 else
  {
   new.names<-c(new.names,h)
  }
}
rownames(rv0081.deg.ko)[rownames(rv0081.deg.ko) %in% names.to.change]<-new.names
#Same idea with information from the MTB Network portal
mtb.names.tab2<-read.table("mtb_portal_genes2111.txt",fill=T)
mtb.names.tab2<-unique.matrix(mtb.names.tab2)
names.to.change<-rownames(rv0081.deg.ko)[grep("Rv",rownames(rv0081.deg.ko),invert=T)]
new.names<-c()
for(h in names.to.change)
{
 pos.g<-which(mtb.names.tab2[,2]==h)
 if(length(pos.g)>0)
 {
   new.names<-c(new.names,as.character(mtb.names.tab2[pos.g,1]))
 }
 else
 {
   new.names<-c(new.names,h)
 }
}
rownames(rv0081.deg.ko)[rownames(rv0081.deg.ko) %in% names.to.change]<-new.names
#Rv0081 targets that were DE in Rv0081 KO
rv0081.regulon.de<-intersect(rownames(rv0081.deg.ko),Rv0081.regulon)
#mtb.exp contains the data for Rv0081 over-expression
#First plot Rv0081 targets (according to ChIP-seq data) that were DE in the Rv0081 KO
plot(rv0081.deg.ko[rv0081.regulon.de,"Log2.._Rv0081.WT."],mtb.exp[rv0081.regulon.de,"Rv0081"],main="Fig S4A",xlab="Log2 fold-change Rv0081 KO (Hypoxia)",ylab="Log2 fold-change Rv0081 over-expression (Normoxia)",col="red",pch=1,ylim=c(-3,5.5),xlim=c(-9.5,3.5))
other.de.genes.in.rv0081.ko<-intersect(rownames(mtb.exp),setdiff(rownames(rv0081.deg.ko),rv0081.regulon.de))
# Add genes that are not Rv0081 targets (according to ChIP-seq data)
points(rv0081.deg.ko[other.de.genes.in.rv0081.ko,"Log2.._Rv0081.WT."],mtb.exp[other.de.genes.in.rv0081.ko,"Rv0081"],col="dark green",pch=1)
abline(lty=2,col="grey",h=1)
abline(lty=2,col="grey",h=-1)
abline(lty=2,col="grey",v=1)
abline(lty=2,col="grey",v=-1)
legend("topleft",legend=c("Rv0081 target (ChIP-seq)","Not Rv0081 target (ChIP-seq)"),col=c("red","dark green"),pt.cex=1,bty="n",cex=0.65,pch=1)
```
6a. Create Fig 4A-B
```{r}
  top.ffl<-list(c("Rv0081","Rv3249c"),c("Rv0081","Rv0324"))
  #Rv0081-Rv3249c FFL
  tf.pair<-top.ffl[[1]]
  par(mfrow=c(1,3))
  #Depletion-associated genes
  depletion.genes<-gene.clusters[["Depletion"]] 
  not.DEGs<-setdiff(rownames(median.normalized.counts),DEGs.timecourse)
  main.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[1]),2]
  secondary.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[2]),2]
  ffl.targets<-intersect(main.tf.regulon,secondary.tf.regulon)
  depletion.genes.not.bound.by.main.tf<-setdiff(depletion.genes,main.tf.regulon)
  depletion.genes.bound.by.main.tf.not.second<-setdiff(intersect(depletion.genes,main.tf.regulon),ffl.targets)
  depletion.genes.bound.by.ffl<-intersect(depletion.genes,ffl.targets)
  hist(rv0081.deg.ko[intersect(depletion.genes,rownames(rv0081.deg.ko)),"Log2.._Rv0081.WT."],col="cadetblue2",main="Fig 4A",
       xlab="Log2 fold-change (Rv0081 KO vs WT)")
  boxplot(mtb.exp[not.DEGs,"Rv0081"],mtb.exp[depletion.genes.not.bound.by.main.tf,"Rv0081"],mtb.exp[depletion.genes.bound.by.main.tf.not.second,"Rv0081"],mtb.exp[depletion.genes.bound.by.ffl,"Rv0081"],las=2,names=c("A(1467)","B(352)","C(62)","D(32)"),col=c("grey",rep("cadetblue2",3)),ylab="Log2 fold-change (Rv0081 TFOE vs WT)",cex.axis=1,cex.lab=1,main="Fig 4B")
plot.new()
legend("topleft",legend=c("A:not DE in time course","B:not controlled by Rv0081","C:controlled by Rv0081 but not Rv3249c ","D:controlled by Rv0081-Rv3249c FFL"),col="white",pt.cex=1,bty="n",cex=0.74,pch=1)
```
6b. Create Fig 4C-D
```{r}
  #Rv0081-Rv0324
  tf.pair<-top.ffl[[2]]
  #Late hypoxia-associated genes
  late.genes<-gene.clusters[["Late"]] 
  main.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[1]),2]
  secondary.tf.regulon<-mtb.chip[which(mtb.chip[,1]==tf.pair[2]),2]
  ffl.targets<-intersect(main.tf.regulon,secondary.tf.regulon)
  late.genes.not.bound.by.main.tf<-setdiff(late.genes,main.tf.regulon)
  late.genes.bound.by.main.tf.not.second<-setdiff(intersect(late.genes,main.tf.regulon),ffl.targets)
  late.genes.bound.by.ffl<-intersect(late.genes,ffl.targets)
  par(mfrow=c(1,3))
  hist(rv0081.deg.ko[intersect(late.genes,rownames(rv0081.deg.ko)),"Log2.._Rv0081.WT."],col="peachpuff",main="Fig 4C",
       xlab="Log2 fold-change (Rv0081 KO vs WT)")
  boxplot(mtb.exp[not.DEGs,"Rv0081"],mtb.exp[late.genes.not.bound.by.main.tf,"Rv0081"],mtb.exp[late.genes.bound.by.main.tf.not.second,"Rv0081"],mtb.exp[late.genes.bound.by.ffl,"Rv0081"],names=c("A(1467)","B(818)","C(99)","D(61)"),las=2,col=c("grey",rep("peachpuff",3)),ylab="Log2 fold-change (Rv0081 TFOE vs WT)",cex.axis=1,cex.lab=1,main="Fig 4D")
plot.new()
legend("topleft",legend=c("A:not DE in time course","B:not controlled by Rv0081","C:controlled by Rv0081 but not Rv0324 ","D:controlled by Rv0081-Rv0324 FFL"),col="white",pt.cex=1,bty="n",cex=0.74,pch=1)
```
7. Generate input files for MotifNet server (FFL detection)
#MotifNet run is temporarily available in: http://netbio.bgu.ac.il/motifnet/#/explorer/1414
```{r}
#The input network is restricted to the interactions that involve DE TFs and their targets
input.network<-c()
#Filter the MTB ChIP-seq derived network used in Item # 5
for(k in 1:nrow(mtb.chip))
{
  current.tf<-mtb.chip[k,1]
  current.target<-mtb.chip[k,2]
  if(current.tf %in% DEGs.timecourse & current.target %in% DEGs.timecourse )
  {
    input.network<-rbind(input.network,mtb.chip[k,])
  }
}
input.nodes<-union(input.network[,1],input.network[,2])
#Write.files (optional)
#write.table(file="input_network.txt",input.network,quote = F,row.names = F,sep = "\t")
#write.table(file="input_nodes.txt",input.nodes,quote = F,row.names = F)
```
8a. Evaluate overlap between TF regulons and the identified transcriptional states - Part I
```{r}
regulons.enriched.with.own.state<-c()
for(z in de.tfs)
{
  current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
  #Define the state of each DE TF
  #Default value is Normoxia
  tf.state<-1
  for(x in 2:6)
  {
  if(length(grep(z,gene.clusters[[x]]))>0)
  {
    tf.state<-x
  }
  }
  members.current.regulon.in.TF.state<-intersect(current.regulon,gene.clusters[[tf.state]])
  regulons.enriched.with.own.state.pvalue<-phyper(length(members.current.regulon.in.TF.state)-1,length(gene.clusters[[tf.state]]),length(DEGs.timecourse)-length(gene.clusters[[tf.state]]),length(current.regulon),lower.tail = F)
    if(regulons.enriched.with.own.state.pvalue <= 0.05)
    {
      regulons.enriched.with.own.state<-c(regulons.enriched.with.own.state,z)
    }
}
print(paste("There are",length(regulons.enriched.with.own.state), "regulons enriched with genes assigned to the same transcriptional state as the regulating TF",sep=" "))
#Permutation test to evaluate significance of enrichment instances
#Number of permutations
N=1000
count.regulons.randomly.enriched.with.same.state<-c()
for(w in 1:N)
{
#Shuffled DEGs state-membership
shuffled.degs<-DEGs.timecourse[sample(1:length(DEGs.timecourse),length(DEGs.timecourse))]
random.states<-list()
for(i in 1:6)
{
  random.states[[i]]<-shuffled.degs[1:length(gene.clusters[[i]])]
  shuffled.degs<-shuffled.degs[-1*(1:length(random.states[[i]]))]
}
#Evaluate random enrichment
regulons.randomly.enriched.with.own.state<-c()
for(z in de.tfs)
{
  current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
  #Define the state of each DE TF
  #Default value is Normoxia
  tf.state<-1
  for(x in 2:6)
  {
  if(length(grep(z,random.states[[x]]))>0)
  {
    tf.state<-x
  }
  }
  members.current.regulon.in.random.TF.state<-intersect(current.regulon,random.states[[tf.state]])
  regulons.randomly.enriched.with.own.state.pvalue<-phyper(length(members.current.regulon.in.random.TF.state)-1,length(random.states[[tf.state]]),length(DEGs.timecourse)-length(random.states[[tf.state]]),length(current.regulon),lower.tail = F)
    if(regulons.randomly.enriched.with.own.state.pvalue <= 0.05)
    {
      regulons.randomly.enriched.with.own.state<-c(regulons.randomly.enriched.with.own.state,z)
    }
}
count.regulons.randomly.enriched.with.same.state<-c(count.regulons.randomly.enriched.with.same.state,length(regulons.randomly.enriched.with.own.state))
}
#Permutation p-value
print(length(which(count.regulons.randomly.enriched.with.same.state >= length(regulons.enriched.with.own.state))))
```
8b. Evaluate overlap between TF regulons and the identified transcriptional states - Part II
```{r}
regulons.enriched.with.other.state<-c()
for(z in de.tfs)
{
  current.regulon<-intersect(mtb.chip[which(mtb.chip[,1]==z),2],DEGs.timecourse)
  #Define the state of each DE TF
  #Default value is Normoxia
  tf.state<-1
  for(x in 2:6)
  {
  if(length(grep(z,gene.clusters[[x]]))>0)
  {
    tf.state<-x
  }
  }
  for(q in setdiff(1:6,tf.state))
  {
   members.current.regulon.in.other.state<-intersect(current.regulon,gene.clusters[[q]])
  enrichment.with.other.state.pvalue<-phyper(length(members.current.regulon.in.other.state)-1,length(gene.clusters[[q]]),length(DEGs.timecourse)-length(gene.clusters[[q]]),length(current.regulon),lower.tail = F)
    if(enrichment.with.other.state.pvalue <= 0.05)
    {
      regulons.enriched.with.other.state<-c(regulons.enriched.with.other.state,z)
    }
  }
}
print(paste("There are",length(unique(regulons.enriched.with.other.state)), "regulons enriched with genes assigned a transcriptional state different from the one of the regulating TF",sep=" "))
```